#! /usr/bin/env python
# 
# Fuzzy merge of bibliographies.
#
# A hierarchy of matching tests is implemented:
#
#	Reference type
#	Year of publication
#	Month of publication (if known)
#	Volume number (if article type)
#	Page numbers (if known)
#	Author surnames
#	Fuzzy match on title excluding white space and punctuation characters,
#	  using Levenshtein distance
#
# Really useful when you are jointly writing a paper and the authors have partially
# overlapping bib files with different cite key conventions.

# Copyright (c) 2007, Peter Corke
#
# All rights reserved.
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#     * Redistributions of source code must retain the above copyright
#       notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#     * The name of the copyright holder may not be used to endorse or 
#	promote products derived from this software without specific prior 
#	written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS''
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS AND CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.

import Bibliography;
import BibEntry;
import BibTeX;
import string;
import sys;
import optparse;

## parse switches
usage = '''usage: %prog [options] [bibfiles]

:: Fuzzy merge of bibliographies.'''
p = optparse.OptionParser(usage)
p.add_option(
    '--dthresh', dest='dthresh', action='store', type='int', default=2,
    help='set the fuzzy match tolerance (Levenstein distance) for title string')
p.add_option(
    '--showdup', dest='showdup', action='store_true', default=False,
    help='show information about proposed duplicates')
p.add_option(
    '-v', '--verbose', dest='verbose', action='store_true', default=False,
    help='print some extra information')
opts, args = p.parse_args()

# Publish each parsed switch as a module-level name; action() reads
# dthresh, showdup and verbose as globals.
dthresh = opts.dthresh
showdup = opts.showdup
verbose = opts.verbose

# With no file arguments and an interactive stdin there is nothing to
# merge, so show the help text and stop.
if not args and sys.stdin.isatty():
    p.print_help()
    sys.exit(0)


# Entries kept so far (one per distinct reference) and the running count
# of entries rejected as duplicates; both are updated by action().
unique = BibTeX.BibTeX()
dupcount = 0

def action(bib, filename):
    """Merge the entries of one parsed bibliography into ``unique``.

    Every entry of ``bib`` is compared, in order, against each entry
    already held in the module-level ``unique`` collection using
    ``be.match(ub, dthresh=dthresh)``.  On the first match the entry is
    treated as a duplicate: ``dupcount`` is incremented and the entry is
    not stored.  An entry that matches nothing is added with
    ``unique.insertEntry``.

    Diagnostics go to stderr and are controlled by the module-level
    switches ``verbose`` and ``showdup``.

    Parameters:
        bib: iterable collection of entries that also supports ``len()``.
        filename: label naming the source of ``bib``; used only in the
            diagnostic messages.
    """
    # Only dupcount is rebound here; unique is mutated through its own
    # method, so it needs no global declaration.
    global dupcount

    # write() with an explicit newline emits the same text as the old
    # "print >> sys.stderr" statement and also runs under Python 3.
    err = sys.stderr.write

    if verbose:
        err("%d records read from %s\n" % (len(bib), filename))

    # for each new bib entry
    for be in bib:
        # check against all existing
        for ub in unique:
            if be.match(ub, dthresh=dthresh):
                if verbose:
                    err(" -[%s] %s\n" % (be.getKey(), be))
                dupcount += 1
                if showdup:
                    err("=============================\n")
                    ub.write(sys.stderr)
                    # Use the caller-supplied label, as the "records read"
                    # message does, so both messages name the source alike.
                    err("---------- duplicate from %s\n" % filename)
                    be.write(sys.stderr)
                # the first match settles it; skip the remaining entries
                break
        else:
            # inner loop ended without a break: no existing entry matched
            if verbose:
                err(" +[%s] %s\n" % (be.getKey(), be))
            unique.insertEntry(be)

## read the input files
# Each input is parsed into its own fresh BibTeX container, so action()
# is handed only the records of that one source.
if args:
    for f in args:
        bib = BibTeX.BibTeX()
        bib.parseFile(f)
        action(bib, f)
else:
    # No file arguments: parseFile() is called without a name
    # (presumably it then reads standard input -- confirm in BibTeX)
    # and the records are labelled '(stdin)' in diagnostics.
    bib = BibTeX.BibTeX()
    bib.parseFile()
    action(bib, '(stdin)')


# Summary goes to stderr; the merged bibliography itself is emitted by
# unique.write() below.
sys.stderr.write("New bib has %d records, %d duplicates found\n" % (len(unique), dupcount))

unique.write()
